Introduction

The purpose of this document is to examine codon usage patterns at the genome-wide level and for user-defined gene-sets-of-interest, across four Strongyloides spp. and C. elegans.

For the genes-of-interest, four groups are analyzed:
1) Top 1% of codon adapted genes in each genome
2) Members of the astacin gene family
3) Members of the SCP/TAP gene family
4) Chemoreceptor genes, including C. elegans and Strongyloides spp.

The astacin and SCP/TAP gene families are analyzed because members of these families dominate lists of genes that are upregulated in parasitic female life stages (see: Hunt et al 2016).

Data Preprocessing

Full lists of CDS sequences for S. stercoralis, S. ratti, S. papillosus, S. venezuelensis and C. elegans were download from Wormbase ParaSite on October 1, 2020. The .fa files were used as input to the Strongyloides Codon Adapter App. For each species an excel report containing the computed codon adaptation index values relative to S. ratti and C. elegans was downloaded; these file are uploaded the code chunks below.

Analysis/Results

Filter Genome Data

For each species, load data listing GC content and CAI indeces for the full list of CDS elements in genome. The full list contains transcript level information. In cases where there are multiple transcripts per gene, only use CAI values from the longest transcript. If this code is run, a new copy of the data will be saved at the end of the Markdown file. If downstream tidy version of this data is avliable or the filtered versions are avaliable as files, don’t regenerate them.

if (!file.exists(c("./Data/Genomic/tidy_genomic_CAI.csv"))) {
  if (!all(file.exists(c("./Data/Genomic/Ss_Genomic_CAI_Table.xlsx",
                         "./Data/Genomic/Sr_Genomic_CAI_Table.xlsx",
                         "./Data/Genomic/Sp_Genomic_CAI_Table.xlsx",
                         "./Data/Genomic/Sv_Genomic_CAI_Table.xlsx",
                         "./Data/Genomic/Ce_Genomic_CAI_Table.xlsx")))) {
    
    temp <- c(Ss = './Data/Genomic/Ss_Codon_Usage_Report.xlsx',
              Sr = './Data/Genomic/Sr_Codon_Usage_Report.xlsx',
              Sp = './Data/Genomic/Sp_Codon_Usage_Report.xlsx',
              Sv = './Data/Genomic/Sv_Codon_Usage_Report.xlsx',
              Ce = './Data/Genomic/Ce_Codon_Usage_Report.xlsx')
    dat.genomic <- sapply(temp, read_excel, 
                          sheet = 1, 
                          skip = 3, 
                          col_names = T, 
                          simplify = F, 
                          USE.NAMES = T)
    # This data contains transcript level information. In cases where there are 
    # multiple transcripts per gene, need to only include 
    # values from the longest transcript.
    dat.genomic.cleaned <- lapply(dat.genomic, function(x){
      x %>%
        dplyr::rename(transcriptID = geneID) %>%
        dplyr::mutate(geneID = str_remove_all(transcriptID, "\\.[0-9]$")) %>%
        dplyr::mutate(geneID = str_remove_all(geneID, "[a-c]$")) %>%
        dplyr::group_by(geneID) %>%
        dplyr::slice_max(n = 1, order_by = str_length(`cDNA sequence`), 
                         with_ties = FALSE) %>%
        dplyr::relocate(geneID, .before = everything())
      
    })
    
  }
}

Tidy Genome Data

Tidy the genomic CAI values for downstream plotting and analysis. This can be computationally intensive, so if the code is run, a tidy version will get saved at the end of the Markdown file as a .csv file. If that file exists, don’t re-run this code.

if (!file.exists(c("./Data/Genomic/tidy_genomic_CAI.csv"))) {
  temp <- c("./Data/Genomic/Ss_Genomic_CAI_Table.xlsx",
            "./Data/Genomic/Sr_Genomic_CAI_Table.xlsx",
            "./Data/Genomic/Sp_Genomic_CAI_Table.xlsx",
            "./Data/Genomic/Sv_Genomic_CAI_Table.xlsx",
            "./Data/Genomic/Ce_Genomic_CAI_Table.xlsx")
  dat.genomic.cleaned <- sapply(temp, read_excel, 
                                sheet = 1, 
                                col_names = T, 
                                simplify = F, 
                                USE.NAMES = T)
  names(dat.genomic.cleaned) <- c("Ss", "Sr", "Sp", "Sv", "Ce")
  
  dat.genomic.df <- dat.genomic.cleaned %>%
    map("geneID") %>%
    unlist() %>%
    as_tibble_col(column_name = "geneID") %>%
    group_by(geneID)
  
  dat.genomic.df <- dat.genomic.cleaned %>%
    map("transcriptID") %>%
    unlist() %>%
    as_tibble_col(column_name = "transcriptID") %>%
    add_column(dat.genomic.df, .)
  
  dat.genomic.df <- dat.genomic.cleaned %>%
    map("Sr_CAI") %>%
    unlist() %>%
    as_tibble_col(column_name = "Sr_CAI") %>%
    add_column(dat.genomic.df, .)
  
  dat.genomic.df <- dat.genomic.cleaned %>%
    map("Ce_CAI") %>%
    unlist() %>%
    as_tibble_col(column_name = "Ce_CAI") %>%
    add_column(dat.genomic.df, .)
  
  dat.genomic.df <- dat.genomic.cleaned %>%
    map("GC") %>%
    unlist() %>%
    as_tibble_col(column_name = "GC") %>%
    add_column(dat.genomic.df, .)
  
  # Add the column that species which species the gene is from
  # by matching to the original lists of genes, 
  # the group
  dat.genomic.df <- dat.genomic.df %>%
    dplyr::mutate(species = case_when(
      transcriptID %in% dat.genomic$Ss$geneID ~ 'Ss',
      transcriptID %in% dat.genomic$Sr$geneID ~ 'Sr',
      transcriptID %in% dat.genomic$Sp$geneID ~ 'Sp',
      transcriptID %in% dat.genomic$Sv$geneID ~ 'Sv',
      transcriptID %in% dat.genomic$Ce$geneID ~ 'Ce',
    ))
  dat.genomic.df <- group_by(dat.genomic.df, species)
  
}

CAI: Genome-wide Distribution Plot

This section takes the calculated codon adaptation index values for all predicted coding sequences, and generates a density curve for each species. Ideally, this plot could be used to benchmark the calculated CAI values against other metrics of codon bias in these species. For example, Mitreva et al 2006 reported effective number of codons (ENC) as a measure of gene-wise codon bias, across species. They report that “mean ENC across all sampled nemtaode species is 46.7 +/- 5.1” but that S. stercoralis and S. ratti are outliers with low ENC values (~35). Note: ENC values range from 20 for highly biased to 61 for no bias. Thus, benchmarking from Mitreva et al 2006, we might expect that Strongyloides species will have greater codon bias than C. elegans. Unfortunately, although they do report measuring distribution of ENC values across all transcripts for each species, they only have plots for S. ratti, P. trichosuri and P. pacificus.

dat.genomic.df <- suppressWarnings(
  read.csv("./Data/Genomic/tidy_genomic_CAI.csv", 
           header = TRUE)) %>%
  as_tibble() 

species.specific.dat.df <- dat.genomic.df %>%
  dplyr::mutate(species_CAI = case_when(
    species == 'Ce' ~ Ce_CAI,
    species != 'Ce' ~ Sr_CAI
  )) %>%
  dplyr:: select(geneID, species_CAI, species) %>%
  group_by(species)


plot_distributions <- function(dat,
                               x,
                               title,
                               xlabel,
                               caption = "",
                               ylabel = "Density (scaled to maximum of 1)",
                               xlim = c(0,1),
                               ylim = c(0,1)) {
  dat %>%
    ggplot(aes(x = x, y = after_stat(ndensity), color = species)) +
    geom_freqpoly(bins = 50) +
    scale_color_manual(values = c("seagreen4", "steelblue4", 
                                  "coral4", "darkgoldenrod4", 
                                  "darkorchid4"),
                       name = "Species",
                       labels = c("C. elegans", "S. papillosus", 
                                  "S. ratti", "S. stercoralis",
                                  "S. venezuelensis")) +
    labs(title = title,
         subtitle = "Data includes all predicted coding sequences",
         x = xlabel,
         y = ylabel,
         caption = caption) +
    coord_equal(xlim = xlim, ylim = ylim) +
    theme_bw() +
    theme(plot.title.position = "plot",
          plot.caption.position = "panel",
          plot.title = element_text(face = "bold",
                                    size = 10, hjust = 0),
          plot.subtitle = element_text(size = 8),
          legend.title = element_text(size = 8),
          axis.title = element_text(size = 8),
          axis.text = element_text(size = 8),
          axis.text.x = element_text(angle = 45, hjust = 1))
}
species_adaptiveness_plot <- plot_distributions(species.specific.dat.df,
                                                species.specific.dat.df$species_CAI,
                                                "Species-wide codon adaptiveness",
                                                "Codon bias relative to \n genus coding usage rules (CAI)",
                                                "Strongyloides species values relative to S. ratti usage rules
       C. elegans values relative to C. elegans usage rules.")

species_adaptiveness_plot

Notes re: the above plot - C. elegans shows overall lower codon bias compared to the Strongyloides species, as predicted from the Mitreva et al 2006 results. Reminder that Mitreva et al reported that this difference is greatly affected by the GC content; more AT rich speices like the Strongyloides species have greater codon usage biases. Also, it looks like the Strongyloides subclades (S. ratti - S. stercoralis; S. papillosus - S. venezuelensis) cluster together.

Kruskal-Wallis Test with Dunn’s Post tests for differences in genomic CAI values across species

# Kruskal-Wallis testing of genomic CAI
one.way <- kruskal.test(species_CAI ~ species, data = species.specific.dat.df)

# Posthoc Dunn tests with Bonferroni adjustment
post.hoc <- dunn.test::dunn.test(species.specific.dat.df$species_CAI, species.specific.dat.df$species, method = "bonferroni")
##   Kruskal-Wallis rank sum test
## 
## data: x and group
## Kruskal-Wallis chi-squared = 34026.9091, df = 4, p-value = 0
## 
## 
##                            Comparison of x by group                            
##                                  (Bonferroni)                                  
## Col Mean-|
## Row Mean |         Ce         Sp         Sr         Ss
## ---------+--------------------------------------------
##       Sp |  -88.77112
##          |    0.0000*
##          |
##       Sr |  -164.1564  -89.61776
##          |    0.0000*    0.0000*
##          |
##       Ss |  -147.3374  -70.52420   18.64147
##          |    0.0000*    0.0000*    0.0000*
##          |
##       Sv |  -87.70627  -0.637776   87.42893   68.63426
##          |    0.0000*     1.0000    0.0000*    0.0000*
## 
## alpha = 0.05
## Reject Ho if p <= alpha/2

GC: Genome-wide Distribution Plot

This section takes the calculated GC content values for all predicted coding sequences, and generates a density curve for each species.

species.specific.GC.df <- dat.genomic.df %>%
  dplyr:: select(geneID, GC, species) %>%
  group_by(species)



species_GC_plot <- plot_distributions(species.specific.GC.df,
                                      species.specific.GC.df$GC,
                                      "Species-wide GC content",
                                      "GC content")

species_GC_plot

Kruskal-Wallis Test with Dunn’s Post tests for differences in genomic GC values across species

# Kruskal-Wallis testing of genomic GC
one.way <- kruskal.test(GC ~ species, data = species.specific.GC.df)

# Posthoc Dunn tests with Bonferroni adjustment
post.hoc <- dunn.test::dunn.test(species.specific.GC.df$GC, species.specific.GC.df$species, method = "bonferroni")
##   Kruskal-Wallis rank sum test
## 
## data: x and group
## Kruskal-Wallis chi-squared = 34478.504, df = 4, p-value = 0
## 
## 
##                            Comparison of x by group                            
##                                  (Bonferroni)                                  
## Col Mean-|
## Row Mean |         Ce         Sp         Sr         Ss
## ---------+--------------------------------------------
##       Sp |   115.6228
##          |    0.0000*
##          |
##       Sr |   163.3622   62.33462
##          |    0.0000*    0.0000*
##          |
##       Ss |   156.6251   53.74131  -8.686481
##          |    0.0000*    0.0000*    0.0000*
##          |
##       Sv |   121.7444   9.030361  -53.06919  -44.48675
##          |    0.0000*    0.0000*    0.0000*    0.0000*
## 
## alpha = 0.05
## Reject Ho if p <= alpha/2

Analyze Codon Adaptation Index (CAI) values displayed by gene sets of interest

Loading Hunt et al 2016 Ensembl Compara database, so I can use those gene sets to select genes of interest.

dat.Ensembl.IDs <- read_excel('./Data/Genomic/Hunt_Parasite_Ensembl_Compara.xlsx', 
                              sheet = 1, 
                              skip = 0, 
                              col_names = T) %>%
  pivot_longer(-`Compara family id`,
               values_to = "geneIDs") %>%
  dplyr::select(c(`Compara family id`, geneIDs)) %>%
  dplyr::rename(familyID = c(`Compara family id`), geneID = geneIDs) %>%
  dplyr::mutate(geneID = str_remove_all(geneID, "\\.[0-9]$")) %>%
  dplyr::mutate(geneID = str_remove_all(geneID, "[a-c]$"))

dat.Ensembl.families <- read_excel('./Data/Genomic/Hunt_Parasite_Ensembl_Compara.xlsx', 
                                   sheet = 2, 
                                   skip = 0, 
                                   col_names = T) %>%
  dplyr::select(c(`Compara family id`, `Description`))%>%
  dplyr::rename(familyID = c(`Compara family id`))

Extract List of Strongyloides SCP/TAPS protein feamily

Get list of geneIDs associated with SCP/TAPs gene sets, use to pull CAI values from genome-wide list. This gene family chosen because it is one of two gene families that dominate lists of genes that are upregulated in parasitic female life stages.

dat.SCPTAPS <- dat.Ensembl.families %>%
  dplyr::filter(str_detect(Description, "SCP/TAP")) %>%
  left_join(dat.Ensembl.IDs, by = "familyID") %>%
  dplyr::filter(str_detect(geneID, "SVE|SPAL|SSTP|SRAE")) %>%
  left_join(dat.genomic.df, by = "geneID")%>%
  dplyr::mutate(Description = factor(Description)) %>%
  dplyr::group_by(Description, species) %>%
  dplyr::select(geneID, Sr_CAI, species) %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv"))) 

Extract List of Strongyloides astacin genes

Get list of geneIDs associated with astacin gene sets, use to pull CAI values from genome-wide list. This gene family chosen because it is one of two gene families that dominate lists of genes that are upregulated in parasitic female life stages.

dat.astacin <- dat.Ensembl.families %>%
  dplyr::filter(str_detect(Description, "Astacin")) %>%
  left_join(dat.Ensembl.IDs, by = "familyID") %>%
  dplyr::filter(str_detect(geneID, "SVE|SPAL|SSTP|SRAE")) %>%
  left_join(dat.genomic.df, by = "geneID")%>%
  dplyr::mutate(Description = factor(Description)) %>%
  dplyr::group_by(Description, species) %>%
  dplyr::select(geneID, Sr_CAI, species) %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv")))

Load Chemoreceptor Data

For chemoreceptors, load data listing GC content and CAI indeces.

temp <- c(
  Ss_GoI = './Data/Chemoreceptors/Ss_Chemoreceptor_Codon_Usage_Report.xlsx',
  Sr_GoI = './Data/Chemoreceptors/Sr_Chemoreceptor_Codon_Usage_Report.xlsx',
  Sv_GoI = './Data/Chemoreceptors/Sv_Chemoreceptor_Codon_Usage_Report.xlsx',
  Sp_GoI = './Data/Chemoreceptors/Sp_Chemoreceptor_Codon_Usage_Report.xlsx')

dat.Ss.GoI <- read_excel(temp[["Ss_GoI"]], 
                         sheet = 1, 
                         skip = 3, 
                         col_names = T) %>%
  dplyr::select(!c(GC,`cDNA sequence`)) %>%
  dplyr::mutate(species = "Ss")

dat.Sr.GoI <- read_excel(temp[["Sr_GoI"]], 
                         sheet = 1, 
                         skip = 3, 
                         col_names = T) %>%
  dplyr::select(!c(GC,`cDNA sequence`)) %>%
  dplyr::mutate(species = "Sr")

dat.Sv.GoI <- read_excel(temp[["Sv_GoI"]], 
                         sheet = 1, 
                         skip = 3, 
                         col_names = T) %>%
  dplyr::select(!c(GC,`cDNA sequence`)) %>%
  dplyr::mutate(species = "Sv")

dat.Sp.GoI <- read_excel(temp[["Sp_GoI"]], 
                         sheet = 1, 
                         skip = 3, 
                         col_names = T) %>%
  dplyr::select(!c(GC,`cDNA sequence`)) %>%
  dplyr::mutate(species = "Sp")


dat.chemoR <- rbind(dat.Ss.GoI, 
                    dat.Sr.GoI, 
                    dat.Sv.GoI, 
                    dat.Sp.GoI) %>%
  dplyr:: select(geneID, Sr_CAI, species) %>%
  dplyr::mutate(Description = factor("Chemoreceptor")) %>%
  dplyr::group_by(Description, species) %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv")))


dat.chemoR$geneID <- str_remove_all(dat.chemoR$geneID, "\\.[0-9]$")
dat.chemoR$geneID <- str_remove_all(dat.chemoR$geneID, "[a-z]$")

Pull Top 1% of Codon Adapted Genes

Take genome-wide CAI values, and for each species determine what the top 1% of codon adapted genes are. Will plot these relative to identified gene sets/families of interest.

dat.TopOne <- dat.genomic.df %>%
  dplyr::filter(species != "Ce") %>%
  dplyr::select(geneID, Sr_CAI, species) %>%
  dplyr::mutate(Description = factor("Top1")) %>%
  dplyr::group_by(Description, species) %>%
  dplyr::slice_max(order_by = Sr_CAI, prop = .01) %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv")))

Randomly Sample of from full list of genomic CAI values

Take genome-wide CAI values, and take a random sample that’s the approximate size of the other datasets. Will plot these relative to identified gene sets/families of interest.

dat.randomSample <- dat.genomic.df %>%
  dplyr::filter(species != "Ce") %>%
  dplyr::select(geneID, Sr_CAI, species) %>%
  dplyr::mutate(Description = factor("RandomSample")) %>%
  dplyr::group_by(Description, species) %>%
  dplyr::sample_frac(.01) %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv")))

Plot Chemoreceptors, Top1, Astacins, and SCP/TAP CAI values

Generate a series of plots (separated by species ID) where for each gene grouping the codon adaptation index relative to highly expressed S. ratti transcripts is plotted.

# Generate genome-wide Sr_CAI values that can be merged with the genes-of-interest datasets
dat.genome <- dat.genomic.df %>%
  dplyr::filter(species != "Ce") %>%
  dplyr::select(geneID, Sr_CAI, species) %>%
  dplyr::mutate(Description = factor("Genome-wide")) %>%
  dplyr::group_by(Description, species) %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv")))


tbl <- dat.SCPTAPS %>%
  dplyr::select(Description, geneID, Sr_CAI, species) %>%
  rbind(.,dat.chemoR, dat.TopOne, dat.astacin, dat.randomSample) %>%
  dplyr::mutate(Description = factor(Description, levels = c("RandomSample", "Astacin", "SCP/TAP", "Chemoreceptor", "Top1")))

cai_plot <- ggplot(tbl, aes(Description,Sr_CAI, species)) +
  #geom_violin(mapping = aes(color = Description), trim = FALSE, alpha = 0.7) +
  geom_jitter(mapping = aes(color = Description), shape = 1) +
  stat_summary(fun = "median",
               geom = "point",
               shape = 20,
               size = 2,
               color = "black",
               show.legend = FALSE) +
  scale_color_hue (l=40)+
  facet_wrap(~species) +
  labs(title = "Species-specific codon adaptiveness",
       subtitle = "Random sample of 1% of genomic values, SCP/TAP genes & Astacins (i.e. parasitism-associated genes), Chemoreceptors genes, Top 1% of codon adapted genes",
       x = "Gene Set Identity",
       y = "Codon Bias relative to \n S. ratti usage rules (CAI)") +
  theme_bw() +
  
  theme(plot.title.position = "plot",
        plot.title = element_text(face = "bold",
                                  size = 8, hjust = 0),
        axis.title = element_text(size = 8),
        axis.text = element_text(size = 8),
        text = element_text(size = 8),
        title = element_text(size = 8),
        axis.text.x = element_text(angle = 45, hjust = 1))

CAI values across gene families

Gene groups plotted here include chemoreceptors, top 1% of codon adapted genes in genome, and members of the Strongyloides genome project families.

cai_plot

Functional Enrichment Analysis of Top1 Genes

Take lists of the top 1% of codon adapted genes in each species, and run either GO analysis or GSEA to see if these genes are enriched for anything interesting.

# Carry out GO enrichment of discarded gene set using gProfiler2 ----
Top1.geneID <- dat.TopOne %>%
  ungroup() %>%
  group_by(species) %>%
  dplyr::group_map(~ {
    gost.res <- .x %>%
      dplyr::select(geneID, Sr_CAI) %>%
      unique() 
  }, .keep = TRUE)

names(Top1.geneID) <- levels(dat.TopOne$species)

gost.results <- lapply(names(Top1.geneID), function(y){
  organism = case_when (y == "Ss" ~ "ststerprjeb528",
                        y == "Sr" ~ "strattprjeb125",
                        y == "Sp" ~ "stpapiprjeb525",
                        y == "Sv" ~ "stveneprjeb530")
  gost.res <- gost(Top1.geneID[[y]]$geneID,
                   organism = organism, 
                   correction_method = "fdr")
})
names(gost.results) <- names(Top1.geneID)

# Find GO terms that are enriched with p-values equal or small than a threshold.
enriched.terms <- lapply(names(Top1.geneID), function(y){
  gost.results[[y]]$result %>%
    as_tibble() %>%
    dplyr::select(term_id, p_value)
}) 
names(enriched.terms) <- names(Top1.geneID)

# Select enriched GO terms that are common to all species
highTerms.overlap <- enriched.terms %>%
  bind_rows(.id = "species") %>%
  dplyr::filter(p_value <= 0.01) %>%
  group_by(term_id) %>%
  summarize(n= n()) %>%
  dplyr::filter(n >= 4)

Interactive Manhattan Plots

S. stercoralis

gostplot(gost.results$Ss, interactive = T, capped = T) 

S. ratti

gostplot(gost.results$Sr, interactive = T, capped = T) 

S. papillosus

gostplot(gost.results$Sp, interactive = T, capped = T) 

S. venezuelensis

gostplot(gost.results$Sv, interactive = T, capped = T) 

I found GO terms that are commonly enriched in all Strongyloides species tested.

dplyr::filter(gost.results$Ss$result, term_id %in% highTerms.overlap$term_id) %>%
  dplyr::select(term_id, term_name)

Save Files and Plots

Saving data and plots generated by the above code.

# Save cleaned versions of the genomic CAI value datasets
if (exists("dat.genomic.cleaned")) {
  suppressMessages(lapply(names(dat.genomic.cleaned), function(x){
    y <- dat.genomic.cleaned[[x]]
    write.xlsx(y, 
               file = paste0("./Data/Genomic/", x,"_Genomic_CAI_Table.xlsx")
    )
    
  }))
}

# Save a tidy version of the complied genomic CAI value dataset if it doesn't exist
if (!file.exists(c("./Data/Genomic/tidy_genomic_CAI.csv"))) {
  write.table(dat.genomic.df, 
              file = "./Data/Genomic/tidy_genomic_CAI.csv", 
              sep = ",", 
              col.names = T, 
              row.names = FALSE)
  
}

# Save distribution plot of codon adaptivness by species
ggsave('./Plots/Codon_Adaptiveness_Distributions_by_Species.pdf', 
       plot = species_adaptiveness_plot, 
       width = 5.5, height = 4, 
       units = "in", device = "pdf", 
       useDingbats = FALSE)

# Save distribution plot of GC content by species
ggsave('./Plots/GC_Distributions_by_Species.pdf', 
       plot = species_GC_plot, 
       width = 5.5, height = 4, 
       units = "in", device = "pdf", 
       useDingbats = FALSE)

# Save .csv lists of CAI values for functional datasets
lapply(levels(tbl$species), function(y){
   dplyr::filter(tbl, species == y) %>%
     pivot_wider(names_from = Description,
                 values_from = Sr_CAI) %>%
     
   write.csv(paste0("./Data/",y,"_FunctionGroupCAI.csv"))
  
})
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
# Save .csv lists of 1% of codon adapted genes per species
Top1.geneID <- dat.TopOne %>%
  ungroup() %>%
  group_by(species) %>%
  dplyr::group_map(~ {
    geneIDs <- .x %>%
      dplyr::select(geneID) %>%
      write.csv(paste0("./Data/Genomic/",.y$species,"_Top1_CAI.csv"))
  }, .keep = TRUE)


# Save plot of codon adaptivness in gene sets of interest
ggsave('./Plots/Top1_ChemoR_SGPF_CAI_Plot.pdf', plot = cai_plot, 
       width = 6, height = 4, 
       units = "in", device = "pdf", 
       useDingbats = FALSE)

# Save list of all significantly enriched GO terms for each species, 
# for the top 1% of codon adapted genes
lapply(names(Top1.geneID), function(y){
  gost.results[[y]]$result %>%
    as_tibble() %>%
    dplyr::filter(significant == TRUE) %>%
    dplyr::select(-c(query, significant,parents)) %>%
    dplyr::arrange(p_value) %>%
    dplyr::relocate(term_id, source, term_name, .before = everything()) %>%
    write.csv(paste0("./Data/Genomic/",y,"_Top1_EnrichedTerms.csv"))
  
}) 
## list()
# Save Functional Enrichment Plots and Tables
lapply(names(Top1.geneID), function(y){
  gost.res <- gost.results[[y]]
  
  gost.ggplot <- gostplot(gost.res, interactive = F, capped = T) +
    labs(title = paste0(y, "top 1% Codon Adapted Genes")) +
    theme(plot.title.position = "plot",
          plot.caption.position = "plot",
          plot.title = element_text(face = "bold",
                                    size = 8, hjust = 0))
  gost.ggplot.pub<- publish_gostplot(gost.ggplot,
                                     highlight_terms = highTerms.overlap$term_id)
  ggsave(paste0('./Plots/',y,'_Top1_CAI_Plot.pdf'), plot = gost.ggplot.pub,
         width = 4, height = 5,
         units = "in", device = "pdf",
         useDingbats = FALSE)
  
  publish_gosttable(gost.res,
                    highlight_terms = highTerms.overlap$term_id,
                    filename = paste0('./Plots/',y,'_Top1_CAI_Table.pdf'))
  
})
## list()

Appendix: All code for this report

suppressPackageStartupMessages({
  library(knitr)
  library(rmarkdown)
  library(tidyverse)
  library(readxl)
  library(magrittr)
  library(DescTools)
  library(ggthemes)
  library(biomaRt)
  library(ggplot2)
  library(openxlsx)
  library(wrapr)
  library(Matching)
  library(cowplot)
  library(gprofiler2)
  library(clusterProfiler)
  source('fetch_cDNAseq.R')
})
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
if (!file.exists(c("./Data/Genomic/tidy_genomic_CAI.csv"))) {
  if (!all(file.exists(c("./Data/Genomic/Ss_Genomic_CAI_Table.xlsx",
                         "./Data/Genomic/Sr_Genomic_CAI_Table.xlsx",
                         "./Data/Genomic/Sp_Genomic_CAI_Table.xlsx",
                         "./Data/Genomic/Sv_Genomic_CAI_Table.xlsx",
                         "./Data/Genomic/Ce_Genomic_CAI_Table.xlsx")))) {
    
    temp <- c(Ss = './Data/Genomic/Ss_Codon_Usage_Report.xlsx',
              Sr = './Data/Genomic/Sr_Codon_Usage_Report.xlsx',
              Sp = './Data/Genomic/Sp_Codon_Usage_Report.xlsx',
              Sv = './Data/Genomic/Sv_Codon_Usage_Report.xlsx',
              Ce = './Data/Genomic/Ce_Codon_Usage_Report.xlsx')
    dat.genomic <- sapply(temp, read_excel, 
                          sheet = 1, 
                          skip = 3, 
                          col_names = T, 
                          simplify = F, 
                          USE.NAMES = T)
    # This data contains transcript level information. In cases where there are 
    # multiple transcripts per gene, need to only include 
    # values from the longest transcript.
    dat.genomic.cleaned <- lapply(dat.genomic, function(x){
      x %>%
        dplyr::rename(transcriptID = geneID) %>%
        dplyr::mutate(geneID = str_remove_all(transcriptID, "\\.[0-9]$")) %>%
        dplyr::mutate(geneID = str_remove_all(geneID, "[a-c]$")) %>%
        dplyr::group_by(geneID) %>%
        dplyr::slice_max(n = 1, order_by = str_length(`cDNA sequence`), 
                         with_ties = FALSE) %>%
        dplyr::relocate(geneID, .before = everything())
      
    })
    
  }
}
if (!file.exists(c("./Data/Genomic/tidy_genomic_CAI.csv"))) {
  temp <- c("./Data/Genomic/Ss_Genomic_CAI_Table.xlsx",
            "./Data/Genomic/Sr_Genomic_CAI_Table.xlsx",
            "./Data/Genomic/Sp_Genomic_CAI_Table.xlsx",
            "./Data/Genomic/Sv_Genomic_CAI_Table.xlsx",
            "./Data/Genomic/Ce_Genomic_CAI_Table.xlsx")
  dat.genomic.cleaned <- sapply(temp, read_excel, 
                                sheet = 1, 
                                col_names = T, 
                                simplify = F, 
                                USE.NAMES = T)
  names(dat.genomic.cleaned) <- c("Ss", "Sr", "Sp", "Sv", "Ce")
  
  dat.genomic.df <- dat.genomic.cleaned %>%
    map("geneID") %>%
    unlist() %>%
    as_tibble_col(column_name = "geneID") %>%
    group_by(geneID)
  
  dat.genomic.df <- dat.genomic.cleaned %>%
    map("transcriptID") %>%
    unlist() %>%
    as_tibble_col(column_name = "transcriptID") %>%
    add_column(dat.genomic.df, .)
  
  dat.genomic.df <- dat.genomic.cleaned %>%
    map("Sr_CAI") %>%
    unlist() %>%
    as_tibble_col(column_name = "Sr_CAI") %>%
    add_column(dat.genomic.df, .)
  
  dat.genomic.df <- dat.genomic.cleaned %>%
    map("Ce_CAI") %>%
    unlist() %>%
    as_tibble_col(column_name = "Ce_CAI") %>%
    add_column(dat.genomic.df, .)
  
  dat.genomic.df <- dat.genomic.cleaned %>%
    map("GC") %>%
    unlist() %>%
    as_tibble_col(column_name = "GC") %>%
    add_column(dat.genomic.df, .)
  
  # Add the column that species which species the gene is from
  # by matching to the original lists of genes, 
  # the group
  dat.genomic.df <- dat.genomic.df %>%
    dplyr::mutate(species = case_when(
      transcriptID %in% dat.genomic$Ss$geneID ~ 'Ss',
      transcriptID %in% dat.genomic$Sr$geneID ~ 'Sr',
      transcriptID %in% dat.genomic$Sp$geneID ~ 'Sp',
      transcriptID %in% dat.genomic$Sv$geneID ~ 'Sv',
      transcriptID %in% dat.genomic$Ce$geneID ~ 'Ce',
    ))
  dat.genomic.df <- group_by(dat.genomic.df, species)
  
}
dat.genomic.df <- suppressWarnings(
  read.csv("./Data/Genomic/tidy_genomic_CAI.csv", 
           header = TRUE)) %>%
  as_tibble() 

species.specific.dat.df <- dat.genomic.df %>%
  dplyr::mutate(species_CAI = case_when(
    species == 'Ce' ~ Ce_CAI,
    species != 'Ce' ~ Sr_CAI
  )) %>%
  dplyr:: select(geneID, species_CAI, species) %>%
  group_by(species)


plot_distributions <- function(dat,
                               x,
                               title,
                               xlabel,
                               caption = "",
                               ylabel = "Density (scaled to maximum of 1)",
                               xlim = c(0,1),
                               ylim = c(0,1)) {
  dat %>%
    ggplot(aes(x = x, y = after_stat(ndensity), color = species)) +
    geom_freqpoly(bins = 50) +
    scale_color_manual(values = c("seagreen4", "steelblue4", 
                                  "coral4", "darkgoldenrod4", 
                                  "darkorchid4"),
                       name = "Species",
                       labels = c("C. elegans", "S. papillosus", 
                                  "S. ratti", "S. stercoralis",
                                  "S. venezuelensis")) +
    labs(title = title,
         subtitle = "Data includes all predicted coding sequences",
         x = xlabel,
         y = ylabel,
         caption = caption) +
    coord_equal(xlim = xlim, ylim = ylim) +
    theme_bw() +
    theme(plot.title.position = "plot",
          plot.caption.position = "panel",
          plot.title = element_text(face = "bold",
                                    size = 10, hjust = 0),
          plot.subtitle = element_text(size = 8),
          legend.title = element_text(size = 8),
          axis.title = element_text(size = 8),
          axis.text = element_text(size = 8),
          axis.text.x = element_text(angle = 45, hjust = 1))
}
species_adaptiveness_plot <- plot_distributions(species.specific.dat.df,
                                                species.specific.dat.df$species_CAI,
                                                "Species-wide codon adaptiveness",
                                                "Codon bias relative to \n genus coding usage rules (CAI)",
                                                "Strongyloides species values relative to S. ratti usage rules
       C. elegans values relative to C. elegans usage rules.")

species_adaptiveness_plot
# Kruskal-Wallis testing of genomic CAI
one.way <- kruskal.test(species_CAI ~ species, data = species.specific.dat.df)

# Posthoc Dunn tests with Bonferroni adjustment
post.hoc <- dunn.test::dunn.test(species.specific.dat.df$species_CAI, species.specific.dat.df$species, method = "bonferroni")

species.specific.GC.df <- dat.genomic.df %>%
  dplyr:: select(geneID, GC, species) %>%
  group_by(species)



species_GC_plot <- plot_distributions(species.specific.GC.df,
                                      species.specific.GC.df$GC,
                                      "Species-wide GC content",
                                      "GC content")

species_GC_plot
# Kruskal-Wallis testing of genomic GC
one.way <- kruskal.test(GC ~ species, data = species.specific.GC.df)

# Posthoc Dunn tests with Bonferroni adjustment
post.hoc <- dunn.test::dunn.test(species.specific.GC.df$GC, species.specific.GC.df$species, method = "bonferroni")

dat.Ensembl.IDs <- read_excel('./Data/Genomic/Hunt_Parasite_Ensembl_Compara.xlsx', 
                              sheet = 1, 
                              skip = 0, 
                              col_names = T) %>%
  pivot_longer(-`Compara family id`,
               values_to = "geneIDs") %>%
  dplyr::select(c(`Compara family id`, geneIDs)) %>%
  dplyr::rename(familyID = c(`Compara family id`), geneID = geneIDs) %>%
  dplyr::mutate(geneID = str_remove_all(geneID, "\\.[0-9]$")) %>%
  dplyr::mutate(geneID = str_remove_all(geneID, "[a-c]$"))

dat.Ensembl.families <- read_excel('./Data/Genomic/Hunt_Parasite_Ensembl_Compara.xlsx', 
                                   sheet = 2, 
                                   skip = 0, 
                                   col_names = T) %>%
  dplyr::select(c(`Compara family id`, `Description`))%>%
  dplyr::rename(familyID = c(`Compara family id`))

dat.SCPTAPS <- dat.Ensembl.families %>%
  dplyr::filter(str_detect(Description, "SCP/TAP")) %>%
  left_join(dat.Ensembl.IDs, by = "familyID") %>%
  dplyr::filter(str_detect(geneID, "SVE|SPAL|SSTP|SRAE")) %>%
  left_join(dat.genomic.df, by = "geneID")%>%
  dplyr::mutate(Description = factor(Description)) %>%
  dplyr::group_by(Description, species) %>%
  dplyr::select(geneID, Sr_CAI, species) %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv"))) 



dat.astacin <- dat.Ensembl.families %>%
  dplyr::filter(str_detect(Description, "Astacin")) %>%
  left_join(dat.Ensembl.IDs, by = "familyID") %>%
  dplyr::filter(str_detect(geneID, "SVE|SPAL|SSTP|SRAE")) %>%
  left_join(dat.genomic.df, by = "geneID")%>%
  dplyr::mutate(Description = factor(Description)) %>%
  dplyr::group_by(Description, species) %>%
  dplyr::select(geneID, Sr_CAI, species) %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv")))


temp <- c(
  Ss_GoI = './Data/Chemoreceptors/Ss_Chemoreceptor_Codon_Usage_Report.xlsx',
  Sr_GoI = './Data/Chemoreceptors/Sr_Chemoreceptor_Codon_Usage_Report.xlsx',
  Sv_GoI = './Data/Chemoreceptors/Sv_Chemoreceptor_Codon_Usage_Report.xlsx',
  Sp_GoI = './Data/Chemoreceptors/Sp_Chemoreceptor_Codon_Usage_Report.xlsx')

dat.Ss.GoI <- read_excel(temp[["Ss_GoI"]], 
                         sheet = 1, 
                         skip = 3, 
                         col_names = T) %>%
  dplyr::select(!c(GC,`cDNA sequence`)) %>%
  dplyr::mutate(species = "Ss")

dat.Sr.GoI <- read_excel(temp[["Sr_GoI"]], 
                         sheet = 1, 
                         skip = 3, 
                         col_names = T) %>%
  dplyr::select(!c(GC,`cDNA sequence`)) %>%
  dplyr::mutate(species = "Sr")

dat.Sv.GoI <- read_excel(temp[["Sv_GoI"]], 
                         sheet = 1, 
                         skip = 3, 
                         col_names = T) %>%
  dplyr::select(!c(GC,`cDNA sequence`)) %>%
  dplyr::mutate(species = "Sv")

dat.Sp.GoI <- read_excel(temp[["Sp_GoI"]], 
                         sheet = 1, 
                         skip = 3, 
                         col_names = T) %>%
  dplyr::select(!c(GC,`cDNA sequence`)) %>%
  dplyr::mutate(species = "Sp")


dat.chemoR <- rbind(dat.Ss.GoI, 
                    dat.Sr.GoI, 
                    dat.Sv.GoI, 
                    dat.Sp.GoI) %>%
  dplyr:: select(geneID, Sr_CAI, species) %>%
  dplyr::mutate(Description = factor("Chemoreceptor")) %>%
  dplyr::group_by(Description, species) %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv")))


dat.chemoR$geneID <- str_remove_all(dat.chemoR$geneID, "\\.[0-9]$")
dat.chemoR$geneID <- str_remove_all(dat.chemoR$geneID, "[a-z]$")
dat.TopOne <- dat.genomic.df %>%
  dplyr::filter(species != "Ce") %>%
  dplyr::select(geneID, Sr_CAI, species) %>%
  dplyr::mutate(Description = factor("Top1")) %>%
  dplyr::group_by(Description, species) %>%
  dplyr::slice_max(order_by = Sr_CAI, prop = .01) %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv")))

dat.randomSample <- dat.genomic.df %>%
  dplyr::filter(species != "Ce") %>%
  dplyr::select(geneID, Sr_CAI, species) %>%
  dplyr::mutate(Description = factor("RandomSample")) %>%
  dplyr::group_by(Description, species) %>%
  dplyr::sample_frac(.01) %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv")))


# Generate genome-wide Sr_CAI values that can be merged with the genes-of-interest datasets
dat.genome <- dat.genomic.df %>%
  dplyr::filter(species != "Ce") %>%
  dplyr::select(geneID, Sr_CAI, species) %>%
  dplyr::mutate(Description = factor("Genome-wide")) %>%
  dplyr::group_by(Description, species) %>%
  dplyr::mutate(species = factor(species, levels = c("Ss", "Sr", "Sp", "Sv")))


tbl <- dat.SCPTAPS %>%
  dplyr::select(Description, geneID, Sr_CAI, species) %>%
  rbind(.,dat.chemoR, dat.TopOne, dat.astacin, dat.randomSample) %>%
  dplyr::mutate(Description = factor(Description, levels = c("RandomSample", "Astacin", "SCP/TAP", "Chemoreceptor", "Top1")))

cai_plot <- ggplot(tbl, aes(Description,Sr_CAI, species)) +
  #geom_violin(mapping = aes(color = Description), trim = FALSE, alpha = 0.7) +
  geom_jitter(mapping = aes(color = Description), shape = 1) +
  stat_summary(fun = "median",
               geom = "point",
               shape = 20,
               size = 2,
               color = "black",
               show.legend = FALSE) +
  scale_color_hue (l=40)+
  facet_wrap(~species) +
  labs(title = "Species-specific codon adaptiveness",
       subtitle = "Random sample of 1% of genomic values, SCP/TAP genes & Astacins (i.e. parasitism-associated genes), Chemoreceptors genes, Top 1% of codon adapted genes",
       x = "Gene Set Identity",
       y = "Codon Bias relative to \n S. ratti usage rules (CAI)") +
  theme_bw() +
  
  theme(plot.title.position = "plot",
        plot.title = element_text(face = "bold",
                                  size = 8, hjust = 0),
        axis.title = element_text(size = 8),
        axis.text = element_text(size = 8),
        text = element_text(size = 8),
        title = element_text(size = 8),
        axis.text.x = element_text(angle = 45, hjust = 1))

cai_plot

# Carry out GO enrichment of discarded gene set using gProfiler2 ----
Top1.geneID <- dat.TopOne %>%
  ungroup() %>%
  group_by(species) %>%
  dplyr::group_map(~ {
    gost.res <- .x %>%
      dplyr::select(geneID, Sr_CAI) %>%
      unique() 
  }, .keep = TRUE)

names(Top1.geneID) <- levels(dat.TopOne$species)

gost.results <- lapply(names(Top1.geneID), function(y){
  organism = case_when (y == "Ss" ~ "ststerprjeb528",
                        y == "Sr" ~ "strattprjeb125",
                        y == "Sp" ~ "stpapiprjeb525",
                        y == "Sv" ~ "stveneprjeb530")
  gost.res <- gost(Top1.geneID[[y]]$geneID,
                   organism = organism, 
                   correction_method = "fdr")
})
names(gost.results) <- names(Top1.geneID)

# Find GO terms that are enriched with p-values equal or small than a threshold.
enriched.terms <- lapply(names(Top1.geneID), function(y){
  gost.results[[y]]$result %>%
    as_tibble() %>%
    dplyr::select(term_id, p_value)
}) 
names(enriched.terms) <- names(Top1.geneID)

# Select enriched GO terms that are common to all species
highTerms.overlap <- enriched.terms %>%
  bind_rows(.id = "species") %>%
  dplyr::filter(p_value <= 0.01) %>%
  group_by(term_id) %>%
  summarize(n= n()) %>%
  dplyr::filter(n >= 4)

gostplot(gost.results$Ss, interactive = T, capped = T) 
gostplot(gost.results$Sr, interactive = T, capped = T) 
gostplot(gost.results$Sp, interactive = T, capped = T) 
gostplot(gost.results$Sv, interactive = T, capped = T) 
dplyr::filter(gost.results$Ss$result, term_id %in% highTerms.overlap$term_id) %>%
  dplyr::select(term_id, term_name)

# Save cleaned versions of the genomic CAI value datasets
if (exists("dat.genomic.cleaned")) {
  suppressMessages(lapply(names(dat.genomic.cleaned), function(x){
    y <- dat.genomic.cleaned[[x]]
    write.xlsx(y, 
               file = paste0("./Data/Genomic/", x,"_Genomic_CAI_Table.xlsx")
    )
    
  }))
}

# Save a tidy version of the complied genomic CAI value dataset if it doesn't exist
if (!file.exists(c("./Data/Genomic/tidy_genomic_CAI.csv"))) {
  write.table(dat.genomic.df, 
              file = "./Data/Genomic/tidy_genomic_CAI.csv", 
              sep = ",", 
              col.names = T, 
              row.names = FALSE)
  
}

# Save distribution plot of codon adaptivness by species
ggsave('./Plots/Codon_Adaptiveness_Distributions_by_Species.pdf', 
       plot = species_adaptiveness_plot, 
       width = 5.5, height = 4, 
       units = "in", device = "pdf", 
       useDingbats = FALSE)

# Save distribution plot of GC content by species
ggsave('./Plots/GC_Distributions_by_Species.pdf', 
       plot = species_GC_plot, 
       width = 5.5, height = 4, 
       units = "in", device = "pdf", 
       useDingbats = FALSE)

# Save .csv lists of CAI values for functional datasets
lapply(levels(tbl$species), function(y){
   dplyr::filter(tbl, species == y) %>%
     pivot_wider(names_from = Description,
                 values_from = Sr_CAI) %>%
     
   write.csv(paste0("./Data/",y,"_FunctionGroupCAI.csv"))
  
})
# Save .csv lists of 1% of codon adapted genes per species
Top1.geneID <- dat.TopOne %>%
  ungroup() %>%
  group_by(species) %>%
  dplyr::group_map(~ {
    geneIDs <- .x %>%
      dplyr::select(geneID) %>%
      write.csv(paste0("./Data/Genomic/",.y$species,"_Top1_CAI.csv"))
  }, .keep = TRUE)


# Save plot of codon adaptivness in gene sets of interest
ggsave('./Plots/Top1_ChemoR_SGPF_CAI_Plot.pdf', plot = cai_plot, 
       width = 6, height = 4, 
       units = "in", device = "pdf", 
       useDingbats = FALSE)

# Save list of all significantly enriched GO terms for each species, 
# for the top 1% of codon adapted genes
lapply(names(Top1.geneID), function(y){
  gost.results[[y]]$result %>%
    as_tibble() %>%
    dplyr::filter(significant == TRUE) %>%
    dplyr::select(-c(query, significant,parents)) %>%
    dplyr::arrange(p_value) %>%
    dplyr::relocate(term_id, source, term_name, .before = everything()) %>%
    write.csv(paste0("./Data/Genomic/",y,"_Top1_EnrichedTerms.csv"))
  
}) 

# Save Functional Enrichment Plots and Tables
lapply(names(Top1.geneID), function(y){
  gost.res <- gost.results[[y]]
  
  gost.ggplot <- gostplot(gost.res, interactive = F, capped = T) +
    labs(title = paste0(y, "top 1% Codon Adapted Genes")) +
    theme(plot.title.position = "plot",
          plot.caption.position = "plot",
          plot.title = element_text(face = "bold",
                                    size = 8, hjust = 0))
  gost.ggplot.pub<- publish_gostplot(gost.ggplot,
                                     highlight_terms = highTerms.overlap$term_id)
  ggsave(paste0('./Plots/',y,'_Top1_CAI_Plot.pdf'), plot = gost.ggplot.pub,
         width = 4, height = 5,
         units = "in", device = "pdf",
         useDingbats = FALSE)
  
  publish_gosttable(gost.res,
                    highlight_terms = highTerms.overlap$term_id,
                    filename = paste0('./Plots/',y,'_Top1_CAI_Table.pdf'))
  
})